Code
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import silhouette_score| HAPPY | SATIS_A | SATIS_B | CHNG_A | CHNG_B | CHNG_C | DIVRELPOP | DIVRACPOP | QB2A | QB2C | ... | YEARLOCREC | MOVED | REG | PARTY | IDEO | HH1REC | INTFREQ | INC_SDT1 | GENDER | FERTREC | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2 | 2 | 2 | 3 | 2 | 2 | 3 | 2 | 2 | ... | 4 | 1 | 1 | 1 | 1 | 3 | 2 | 2 | 2 | 1 |
| 1 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | ... | 3 | 2 | 1 | 2 | 5 | 3 | 2 | 7 | 2 | 2 |
| 2 | 2 | 2 | 3 | 3 | 1 | 1 | 1 | 1 | 1 | 2 | ... | 4 | 1 | 1 | 3 | 3 | 2 | 1 | 7 | 2 | 0 |
| 3 | 3 | 2 | 4 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | ... | 1 | 2 | 1 | 3 | 1 | 1 | 1 | 6 | 1 | 0 |
| 4 | 2 | 4 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | ... | 2 | 2 | 1 | 2 | 4 | 2 | 2 | 5 | 2 | 0 |
5 rows × 101 columns
# Reload original DataFrame if it's been modified
df = pd.read_csv("../data/religion_data_no99.csv")
# from Best subset selection in mlp.ipynb
best_feature_indices = [0, 7, 8, 10, 11, 13, 14, 16, 17, 19, 21, 22, 24, 25, 26, 27, 29, 30, 32, 33, 35, 36, 37, 38, 44, 47, 50, 53, 54, 57, 60, 62, 63, 64, 65, 70, 71, 74, 75, 77, 80, 81, 82, 83, 85, 86, 88, 90, 91, 92]
# Convert to list of column names from index positions
selected_names = [df.columns[i] for i in best_feature_indices if i < len(df.columns)]
print(selected_names)['HAPPY', 'DIVRACPOP', 'QB2A', 'QB2D', 'OPENIDEN', 'GOVSIZE1', 'ABRTLGL', 'PAR2CHILD', 'EVOL', 'GUIDE_B', 'GUIDE_D', 'RELTRAD', 'RELPER', 'SPIRPER', 'ATTNDPERRLS', 'ATTNDONRLS', 'MEMB', 'GOD', 'HLL', 'SOUL', 'PRAY', 'GRACE', 'PRAC_A', 'PRAC_B', 'EXP_D', 'EXP_G', 'SPIRACT_C', 'SPIRACT_F', 'RTRT', 'SCIMPACT', 'SPIRWORLD2', 'SECBEL2', 'CHIMPREL_A', 'CHIMPREL_B', 'CHATTEND', 'GTHGHT', 'MARITAL', 'RELINST_B', 'RELINST_D', 'RELINST_F', 'SCPRY2', 'RELDISP', 'CHRNAT', 'BIRTHDECADE', 'RACECMB', 'AFROHISP', 'E2', 'USGEN', 'YEARLOCREC', 'MOVED']
features = [
"HAPPY",
"DIVRACPOP",
"QB2A",
"QB2D",
"OPENIDEN",
"GOVSIZE1",
"ABRTLGL",
"PAR2CHILD",
"EVOL",
"GUIDE_B",
"GUIDE_D",
# "RELTRAD",
"RELPER",
"SPIRPER",
"ATTNDPERRLS",
"ATTNDONRLS",
"MEMB",
"GOD",
"HLL",
"SOUL",
"PRAY",
"GRACE",
"PRAC_A",
"PRAC_B",
"EXP_D",
"EXP_G",
"SPIRACT_C",
"SPIRACT_F",
"RTRT",
"SCIMPACT",
"SPIRWORLD2",
"SECBEL2",
"CHIMPREL_A",
"CHIMPREL_B",
"CHATTEND",
"GTHGHT",
"MARITAL",
"RELINST_B",
"RELINST_D",
"RELINST_F",
"SCPRY2",
"RELDISP",
"CHRNAT",
"BIRTHDECADE",
"RACECMB",
"AFROHISP",
"E2",
"USGEN",
"YEARLOCREC",
"MOVED",
]We applied KMeans clustering to a set of respondents using a range of non-religious socio-political and demographic features, including views on gender and sexuality, political ideology, education, income, attitudes toward science and government, and more. The goal was to determine whether individuals naturally group into distinct sociocultural profiles — and to assess how those profiles correspond (if at all) with religious tradition (RELTRAD).
Optimalk with elbow method is k=6.
X = df[features].dropna() # Drop missing values
df_clean = df.loc[X.index] # Preserve indices
# One-hot encode relevant categorical features
categorical_cols = ["MARITAL", "BIRTHDECADE", "RACECMB", "AFROHISP", "E2"]
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Elbow Method to choose k
wcss = []
for i in range(2, 10):
kmeans = KMeans(n_clusters=i, random_state=42)
kmeans.fit(X_scaled)
wcss.append(kmeans.inertia_)
plt.plot(range(2, 10), wcss, marker="o")
plt.title("Elbow Method for Optimal k")
plt.xlabel("Number of clusters")
plt.ylabel("WCSS")
plt.show()Silhouette Score demonstrates that optimal k=2, but there are more than two religions so this may be limiting - we won’t be able to map religions directly but we can possibly see traditionalist religions vs progressive.
K = 2silhouette_scores = []
K_range = range(2, 10)
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state=42)
labels = kmeans.fit_predict(X_scaled)
score = silhouette_score(X_scaled, labels)
silhouette_scores.append(score)
plt.figure(figsize=(8, 5))
plt.plot(K_range, silhouette_scores, marker='o')
plt.title('Silhouette Score for different k')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(K_range)
plt.show()from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_clean['pca1'] = X_pca[:, 0]
df_clean['pca2'] = X_pca[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_clean, x='pca1', y='pca2', hue='cluster', palette='Set2')
plt.title("KMeans Clustering with k=2 (PCA-reduced)")
plt.show()| RELTRAD | 1100 | 1200 | 1300 | 10000 | 20000 | 30000 | 40001 | 50000 | 60000 | 70000 | 80000 | 100000 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| cluster | ||||||||||||
| 0 | 432 | 1170 | 73 | 1352 | 28 | 48 | 2 | 412 | 47 | 173 | 98 | 6692 |
| 1 | 4435 | 1876 | 737 | 2722 | 334 | 84 | 32 | 109 | 120 | 59 | 66 | 663 |
Cluster profiles by feature (Standardized Average)
cluster_profiles = df_clean.groupby('cluster')[features].mean().T
cluster_profiles.columns = [f'Cluster {i}' for i in cluster_profiles.columns]
cluster_profiles.plot(kind='bar', figsize=(16, 6), colormap='Set2')
plt.title("Cluster Feature Averages (k=2)")
plt.ylabel("Mean value (standardized scale)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()Cluster 0: Religiously Committed & Socially Conservative
SPIRPER, RELPER), and institutional religion variables like ATTNDPERRLS, CHATTEND, and RELINST_*.QB2D), and government size (GOVSIZE1).PRAC_A, PRAC_B, SPIRACT_*, RTRT).AFROHISP, RACECMB) and more stable (e.g., YEARLOCREC, MOVED).Cluster 1: Spiritual/Exploratory & Less Traditional
GOD, HELL, MEMB).OPENIDEN), environmental concern (SCIMPACT), and diversity tolerance (DIVRACPOP).YEARLOCREC and MOVED.AFROHISP, RACECMB.*RELTRAD Distribution per Cluster
reltrad_labels = {
1100: "Evangelical Protestant",
1200: "Mainline Protestant",
1300: "Historically Black Protestant",
10000: "Catholic",
20000: "Mormon",
30000: "Orthodox Christian",
40001: "Jehovah's Witness",
40002: "Other Christian",
50000: "Jewish",
60000: "Muslim",
70000: "Buddhist",
80000: "Hindu",
90001: "Other World Religions",
90002: "Other Faiths",
100000: "Religiously Unaffiliated",
900000: "Don't know/refused"
}
df_clean['RELTRAD_label'] = df_clean['RELTRAD'].map(reltrad_labels)
ct = pd.crosstab(df_clean['cluster'], df_clean['RELTRAD_label'], normalize='index')
ct.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='tab20')
plt.title("RELTRAD Distribution by Cluster (k=2)")
plt.xlabel("Cluster")
plt.ylabel("Proportion")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()*CURREL Distribution per Cluster
# Map CURREL codes to readable labels
currel_labels = {
1000: "Protestant",
10000: "Catholic",
20000: "Mormon",
30000: "Orthodox Christian",
40001: "Jehovah's Witness",
40002: "Other Christian",
50000: "Jewish",
60000: "Muslim",
70000: "Buddhist",
80000: "Hindu",
90001: "Other world religions",
90002: "Other faiths",
100000: "Religiously unaffiliated",
900000: "Refused/uninterpretable",
}
# Apply label mapping
df_clean["CURREL_label"] = df_clean["CURREL"].map(currel_labels)
# Optional: exclude ambiguous categories
excluded_currel = [
"Refused/uninterpretable",
"Other Christian",
"Other world religions",
"Other faiths",
]
df_clean_currel = df_clean[~df_clean["CURREL_label"].isin(excluded_currel)]
# Create normalized crosstab
ct_currel = pd.crosstab(
df_clean_currel["cluster"], df_clean_currel["CURREL_label"], normalize="index"
)
# Plot
ct_currel.plot(kind="bar", stacked=True, figsize=(14, 7), colormap="tab20")
plt.title("CURREL Distribution by Cluster (k=2)")
plt.xlabel("Cluster")
plt.ylabel("Proportion")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()K = 6Now we will use k=6 to capture more nuanced sociopoltiical differences and possible capture differences between the two subgroups.
kmeans_5 = KMeans(n_clusters=5, random_state=42)
df_clean['cluster_5'] = kmeans_5.fit_predict(X_scaled)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_clean['pca1'] = X_pca[:, 0]
df_clean['pca2'] = X_pca[:, 1]
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df_clean, x='pca1', y='pca2', hue='cluster_5', palette='Set2')
plt.title("KMeans Clustering with k=5 (PCA-reduced)")
plt.legend(title='Cluster')
plt.tight_layout()
plt.show()| CURREL | 1000 | 10000 | 20000 | 30000 | 40001 | 50000 | 60000 | 70000 | 80000 | 100000 |
|---|---|---|---|---|---|---|---|---|---|---|
| cluster | ||||||||||
| 0 | 1675 | 1352 | 28 | 48 | 2 | 412 | 47 | 173 | 98 | 6692 |
| 1 | 7048 | 2722 | 334 | 84 | 32 | 109 | 120 | 59 | 66 | 663 |
Cluster profiles by feature (Standardized Average)
cluster_profiles_5 = df_clean.groupby('cluster_5')[features].mean().T
cluster_profiles_5.columns = [f'Cluster {i}' for i in cluster_profiles_5.columns]
cluster_profiles_5.plot(kind='bar', figsize=(16, 6), colormap='Set2')
plt.title("Cluster Feature Averages (k=5)")
plt.ylabel("Mean value (standardized scale)")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()Cluster 0 (Green-teal)
This group tends to be slightly more moderate or mixed overall.
This cluster may represent a blend of individuals with mixed traditional and secular tendencies, possibly more centrist or disengaged.
Cluster 1 (Blue)
This is the most consistently progressive cluster.
GUIDE_B, GUIDE_D)This group appears to be secular, highly educated, and socially liberal — likely young professionals or cultural progressives.
Cluster 2 (Light Green)
The most religiously devout and traditional group.
GOD, HLL, SOUL, PRAY) and practiceMEMB, ATTNDONRLS, CHATTEND)This is a classically religious conservative group with strong spiritual commitments and more traditional moral views.
Cluster 3 (Pink)
Civic-oriented and socially engaged cluster.
IDEO, PARTY), social concern, and group involvementThis group is civically involved and socially inclusive — possibly religious moderates who are socially progressive.
Cluster 4 (Purple-gray)
A spiritually-inclined but skeptical or questioning cluster.
SPIRACT, SPIRPER)SOUL, GRACE)This may reflect the “spiritual but not religious” demographic — interested in meaning and morality but detached from institutions.
*RELTRAD Distribution per Cluster
# Optional: drop "refused" or unclear categories
excluded = ["Don't know/refused", "Other Christian", "Other World Religions", "Other Faiths"]
df_clean = df_clean[~df_clean['RELTRAD_label'].isin(excluded)]
# RELTRAD by cluster
ct_5 = pd.crosstab(df_clean['cluster_5'], df_clean['RELTRAD_label'], normalize='index')
ct_5.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='tab20')
plt.title("RELTRAD Distribution by Cluster (k=5)")
plt.xlabel("Cluster")
plt.ylabel("Proportion")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()Summary Insight: - Clusters 1 and 0 lean most strongly toward institutional religiosity, especially with Evangelical and Historically Black Protestant affiliations. - Cluster 2 is the most clearly secular, dominated by the unaffiliated. - Clusters 3 and 4 fall in the middle, combining moderate religious identity with some degree of secularization or pluralism.
This distribution further reinforces the idea that religious identity and worldview are not one-to-one — even highly religious groups like Evangelicals appear across clusters, but their dominant presence in one cluster signals ideological clustering within religion itself.
*CURREL Distribution per Cluster
kmeans_5 = KMeans(n_clusters=5, random_state=42)
df_clean['cluster_5'] = kmeans_5.fit_predict(X_scaled)
df_clean_currel = df_clean[~df_clean['CURREL_label'].isin(excluded_currel)]
ct_currel_5 = pd.crosstab(df_clean_currel['cluster_5'], df_clean_currel['CURREL_label'], normalize='index')
# Plot the stacked bar chart
ct_currel_5.plot(kind='bar', stacked=True, figsize=(14, 7), colormap='tab20')
plt.title("CURREL Distribution by Cluster (k=5)")
plt.xlabel("Cluster")
plt.ylabel("Proportion")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()Cluster 2 is by far the most secular cluster, with the Religiously Unaffiliated making up nearly the entire group.
Clusters 0 and 3 show a strong Protestant majority, which is much more pronounced under CURREL than in RELTRAD.
Cluster 1, while also Protestant-heavy, includes a notable presence of Mormon, Muslim, and Catholic identifiers — giving it a more institutionally religious and theologically diverse makeup than others.
Cluster 4 reflects a more mixed composition, with moderate shares of both Protestants and the Unaffiliated, plus some Jewish and Catholic identifiers — suggesting a less clearly polarized religious identity.
Key Insight: - RELTRAD and CURREL tell subtly different stories.
- CURREL simplifies religious identity into broad categories, exaggerating the Protestant/Unaffiliated split.
- RELTRAD captures within-group diversity — showing that, for example, not all Protestants cluster the same way.
So while “religion” as a single label might feel like a strong predictor, it’s really the granularity of religious identification that matters in understanding sociopolitical alignment.
political_features = ["GOVSIZE1", "IDEO", "PARTY", "POORASSIST"]
X = df[political_features].dropna()
df_subset = df.loc[X.index]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Silhouette to find best k
sil_scores = []
K_range = range(2, 10)
for k in K_range:
labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
sil_scores.append(silhouette_score(X_scaled, labels))
# Final clustering
best_k = K_range[sil_scores.index(max(sil_scores))]
cluster_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X_scaled)
df_subset["cluster"] = cluster_labels
# PCA scatter
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_subset["pca1"], df_subset["pca2"] = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7, 6))
sns.scatterplot(data=df_subset, x="pca1", y="pca2", hue="cluster", palette="Set2")
plt.title(f"PCA Clustering - Political Features (k={best_k})")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
# Cluster feature averages
cluster_means = df_subset.groupby("cluster")[political_features].mean().T
cluster_means.columns = [f"Cluster {i}" for i in cluster_means.columns]
cluster_means.plot(kind="bar", figsize=(10, 5), colormap="Set2")
plt.title("Cluster Feature Averages - Political")
plt.ylabel("Standardized Mean")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# RELTRAD distribution
df_subset["RELTRAD_label"] = df_subset["RELTRAD"].map(reltrad_labels)
pd.crosstab(df_subset["cluster"], df_subset["RELTRAD_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("RELTRAD by Cluster - Political Features")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# CURREL distribution
df_subset["CURREL_label"] = df_subset["CURREL"].map(currel_labels)
df_subset_currel = df_subset[~df_subset["CURREL_label"].isin([
"Other Christian", "Other world religions", "Other faiths", "Refused/uninterpretable"
])]
pd.crosstab(df_subset_currel["cluster"], df_subset_currel["CURREL_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("CURREL by Cluster - Political Features")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()Political Cluster Interpretation (k=4)
Cluster 0: Politically moderate-progressive. Mid-to-high on government support for the poor and more liberal (IDEO, PARTY), but slightly lower on trust in large government (GOVSIZE1).
Cluster 1: More conservative. Scores lower on liberal ideology, party identification, and assistance programs. Tends toward traditionalist values.
Cluster 2: Highly liberal-progressive cluster. Very high scores on political ideology (IDEO) and party affiliation, strong support for government aid.
Cluster 3: Politically disengaged or centrist. Lowest scores across the board — could reflect apolitical or ideologically neutral individuals.
RELTRAD & CURREL Distributions:
Key Insight: Political identity is tightly tied to religious affiliation in this dataset — Evangelicals cluster together conservatively, while the unaffiliated and Mainline Protestants show more liberal leanings.
practice_features = [
"ATTNDPERRLS", "ATTNDONRLS", "PRAY", "MEMB", "CHATTEND",
"RELINST_B", "RELINST_D", "RELINST_F", "PRAC_A", "PRAC_B", "GRACE"
]
X = df[practice_features].dropna()
df_subset = df.loc[X.index]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Select best k using silhouette score (no plot)
sil_scores = []
K_range = range(2, 10)
for k in K_range:
labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
sil_scores.append(silhouette_score(X_scaled, labels))
best_k = K_range[sil_scores.index(max(sil_scores))]
cluster_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X_scaled)
df_subset["cluster"] = cluster_labels
# PCA scatter
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_subset["pca1"], df_subset["pca2"] = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7, 6))
sns.scatterplot(data=df_subset, x="pca1", y="pca2", hue="cluster", palette="Set2")
plt.title(f"PCA Clustering - Religious Practice (k={best_k})")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
# Cluster feature averages
cluster_means = df_subset.groupby("cluster")[practice_features].mean().T
cluster_means.columns = [f"Cluster {i}" for i in cluster_means.columns]
cluster_means.plot(kind="bar", figsize=(10, 5), colormap="Set2")
plt.title("Cluster Feature Averages - Religious Practice")
plt.ylabel("Standardized Mean")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# RELTRAD bar
df_subset["RELTRAD_label"] = df_subset["RELTRAD"].map(reltrad_labels)
pd.crosstab(df_subset["cluster"], df_subset["RELTRAD_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("RELTRAD by Cluster - Religious Practice")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# CURREL bar
df_subset["CURREL_label"] = df_subset["CURREL"].map(currel_labels)
df_subset_currel = df_subset[~df_subset["CURREL_label"].isin([
"Other Christian", "Other world religions", "Other faiths", "Refused/uninterpretable"
])]
pd.crosstab(df_subset_currel["cluster"], df_subset_currel["CURREL_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("CURREL by Cluster - Religious Practice")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()Religious Practice Clustering (k=2)
ATTNDPERRLS, ATTNDONRLS) and higher frequency of prayer (PRAY).MEMB) and greater participation in religious rituals or expressions (PRAC_A, PRAC_B, GRACE).RELTRAD distribution: - Cluster 0 includes a large share of the Religiously Unaffiliated, along with Catholics and Mainline Protestants. - Cluster 1 is strongly dominated by Evangelical Protestants, with additional presence from Historically Black Protestants and Catholics. - The divide aligns closely with levels of religious participation — more frequent participation tracks with Evangelical identity.
CURREL distribution: - Cluster 0 is more secular overall — Religiously Unaffiliated make up the largest single group. - Cluster 1 is overwhelmingly Protestant, suggesting that current affiliation with a religious tradition correlates strongly with higher levels of religious engagement.
These clusters show a clear division between institutionally religious individuals (Cluster 1) and those who are less religiously active or unaffiliated (Cluster 0).
spiritual_features = [
"PRAY", "GRACE", "PRAC_A", "PRAC_B", "EXP_D", "EXP_G", "SPIRACT_C",
"SPIRACT_F", "RTRT", "SCIMPACT", "SPIRWORLD2", "SECBEL2"
]
X = df[spiritual_features].dropna()
df_subset = df.loc[X.index]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Best k by silhouette (no plot shown)
sil_scores = []
K_range = range(2, 10)
for k in K_range:
labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
sil_scores.append(silhouette_score(X_scaled, labels))
best_k = K_range[sil_scores.index(max(sil_scores))]
cluster_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X_scaled)
df_subset["cluster"] = cluster_labels
# PCA scatter
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_subset["pca1"], df_subset["pca2"] = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7, 6))
sns.scatterplot(data=df_subset, x="pca1", y="pca2", hue="cluster", palette="Set2")
plt.title(f"PCA Clustering - Spiritual Practice (k={best_k})")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
# Cluster means plot
cluster_means = df_subset.groupby("cluster")[spiritual_features].mean().T
cluster_means.columns = [f"Cluster {i}" for i in cluster_means.columns]
cluster_means.plot(kind="bar", figsize=(12, 5), colormap="Set2")
plt.title("Cluster Feature Averages - Spiritual Practice")
plt.ylabel("Standardized Mean")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# RELTRAD distribution
df_subset["RELTRAD_label"] = df_subset["RELTRAD"].map(reltrad_labels)
pd.crosstab(df_subset["cluster"], df_subset["RELTRAD_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("RELTRAD by Cluster - Spiritual Practice")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# CURREL distribution
df_subset["CURREL_label"] = df_subset["CURREL"].map(currel_labels)
df_subset_currel = df_subset[~df_subset["CURREL_label"].isin([
"Other Christian", "Other world religions", "Other faiths", "Refused/uninterpretable"
])]
pd.crosstab(df_subset_currel["cluster"], df_subset_currel["CURREL_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("CURREL by Cluster - Spiritual Practice")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()Cluster 0
- Demonstrates high engagement in spiritual activities:
- Frequent prayer (PRAY), grace before meals (GRACE), and both formal and informal religious practice (PRAC_A, PRAC_B)
- Strong spiritual experiences and activities (EXP_D, EXP_G, SPIRACT_C, SPIRACT_F)
- Elevated retreat attendance (RTRT)
- Surprisingly, this spiritually active group is dominated by the religiously unaffiliated — especially in both RELTRAD and CURREL. - That is, many in this cluster do not identify with a formal religion, but still engage deeply in spiritual behavior.
Cluster 1
- Displays lower but still present levels of spiritual activity:
- Less ritual participation and fewer experiential indicators
- Compositionally, this cluster is made up of more traditionally affiliated individuals — especially Evangelical Protestants and Catholics, as shown in both religion variables.
This clustering flips common assumptions: the most spiritually active group is not the most traditionally affiliated. In fact, those who identify as religiously unaffiliated dominate the high-engagement cluster — similar to what was observed in religious practice clustering, where Cluster 0 also showed both high practice levels and high unaffiliated rates.
Behavioral measures of spirituality (like prayer or retreat attendance) are not tightly bound to religious labels. Many “unaffiliated” individuals are deeply engaged in practice, while some traditionally affiliated groups show more nominal participation.
This mirrors the pattern seen in religious practice clustering, reinforcing the idea that spiritual action and religious identity are increasingly decoupled in contemporary contexts.
demo_features = [
"BIRTHDECADE", "RACECMB", "AFROHISP", "E2", "USGEN", "YEARLOCREC", "MOVED", "MARITAL"
]
X = df[demo_features].dropna()
df_subset = df.loc[X.index]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Best k by silhouette (no plot)
sil_scores = []
K_range = range(2, 10)
for k in K_range:
labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
sil_scores.append(silhouette_score(X_scaled, labels))
best_k = K_range[sil_scores.index(max(sil_scores))]
cluster_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X_scaled)
df_subset["cluster"] = cluster_labels
# PCA plot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_subset["pca1"], df_subset["pca2"] = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7, 6))
sns.scatterplot(data=df_subset, x="pca1", y="pca2", hue="cluster", palette="Set2")
plt.title(f"PCA Clustering - Demographics (k={best_k})")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
# Cluster feature averages
cluster_means = df_subset.groupby("cluster")[demo_features].mean().T
cluster_means.columns = [f"Cluster {i}" for i in cluster_means.columns]
cluster_means.plot(kind="bar", figsize=(10, 5), colormap="Set2")
plt.title("Cluster Feature Averages - Demographics")
plt.ylabel("Standardized Mean")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# RELTRAD bar
df_subset["RELTRAD_label"] = df_subset["RELTRAD"].map(reltrad_labels)
pd.crosstab(df_subset["cluster"], df_subset["RELTRAD_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("RELTRAD by Cluster - Demographics")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# CURREL bar
df_subset["CURREL_label"] = df_subset["CURREL"].map(currel_labels)
df_subset_currel = df_subset[~df_subset["CURREL_label"].isin([
"Other Christian", "Other world religions", "Other faiths", "Refused/uninterpretable"
])]
pd.crosstab(df_subset_currel["cluster"], df_subset_currel["CURREL_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("CURREL by Cluster - Demographics")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()Cluster 0 consists of individuals who are generally younger, more racially and ethnically diverse (RACECMB, AFROHISP), and more mobile (MOVED). They are slightly more likely to be unmarried and less rooted in their current community.
Cluster 1, in contrast, is older, more likely to be white, and more often married. They also report more geographic stability and higher likelihood of U.S. nativity (USGEN), indicating a more settled and possibly traditional demographic profile.
From a religious identity perspective:
In terms of CURREL, this same trend is visible: Cluster 0 leans more secular, while Cluster 1 shows greater Protestant and Catholic affiliation.
While these demographic clusters reflect meaningful variation in age, race, and stability, the religious differences between them are not as stark as those seen in clusters based on religious practice or values. This suggests that demographics alone don’t strongly segment religious worldview, but they may help shape predispositions.
science_features = [
"EVOL", "GUIDE_B", "GUIDE_D", "GTHGHT", "SCPRY2", "RELDISP"
]
X = df[science_features].dropna()
df_subset = df.loc[X.index]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Best silhouette-based k
sil_scores = []
K_range = range(2, 10)
for k in K_range:
labels = KMeans(n_clusters=k, random_state=42).fit_predict(X_scaled)
sil_scores.append(silhouette_score(X_scaled, labels))
best_k = K_range[sil_scores.index(max(sil_scores))]
cluster_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X_scaled)
df_subset["cluster"] = cluster_labels
# PCA plot
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
df_subset["pca1"], df_subset["pca2"] = X_pca[:, 0], X_pca[:, 1]
plt.figure(figsize=(7, 6))
sns.scatterplot(data=df_subset, x="pca1", y="pca2", hue="cluster", palette="Set2")
plt.title(f"PCA Clustering - Science & Worldview (k={best_k})")
plt.legend(title="Cluster")
plt.tight_layout()
plt.show()
# Cluster feature averages
cluster_means = df_subset.groupby("cluster")[science_features].mean().T
cluster_means.columns = [f"Cluster {i}" for i in cluster_means.columns]
cluster_means.plot(kind="bar", figsize=(10, 5), colormap="Set2")
plt.title("Cluster Feature Averages - Science & Worldview")
plt.ylabel("Standardized Mean")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# RELTRAD breakdown
df_subset["RELTRAD_label"] = df_subset["RELTRAD"].map(reltrad_labels)
pd.crosstab(df_subset["cluster"], df_subset["RELTRAD_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("RELTRAD by Cluster - Science & Worldview")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Religious Tradition", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()
# CURREL breakdown
df_subset["CURREL_label"] = df_subset["CURREL"].map(currel_labels)
df_subset_currel = df_subset[~df_subset["CURREL_label"].isin([
"Other Christian", "Other world religions", "Other faiths", "Refused/uninterpretable"
])]
pd.crosstab(df_subset_currel["cluster"], df_subset_currel["CURREL_label"], normalize="index")\
.plot(kind="bar", stacked=True, figsize=(12, 6), colormap="tab20")
plt.title("CURREL by Cluster - Science & Worldview")
plt.ylabel("Proportion")
plt.xlabel("Cluster")
plt.legend(title="Current Religion", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()Cluster differences on science and metaphysical worldview are present but less stark than in domains like religious practice.
EVOL, GUIDE_B, GUIDE_D).
RELIDISP (religious distrust of science) are relatively neutral, not fully secular.GTHGHT (sense of purpose) and SCPRY2 (spiritual practices), combined with lower evolution endorsement, suggest a more spiritual-but-not-scientific orientation.These clusters suggest that worldview-based divisions are real but less polarized than in direct measures of belief or religious practice. The relationship between science, purpose, and belief appears more complex — possibly shaped by culture, identity, and education as much as formal religion.